; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s

; This test case would fail on GFX908 because no free VGPR was available to use
; as the temporary when copying between AGPRs.
define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v32, v1
; GFX908-NEXT:    v_mov_b32_e32 v33, v0
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; def v[0:31] a[0:15]
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a15
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a31, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a14
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a30, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a13
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a29, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a12
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a28, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a11
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a27, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a10
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a26, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a9
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a25, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a8
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a24, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a7
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a23, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a6
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a22, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a5
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a21, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a4
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a20, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a3
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a19, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a2
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a18, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a1
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a17, v39
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v37, a12 ; Reload Reuse
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a1 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v36, a13 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a14 ; Reload Reuse
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a2 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v34, a15 ; Reload Reuse
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a3 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a4 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a5 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a6 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a7 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a8 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a9 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a10 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; copy
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a1
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a0, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT:    v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a13, v36 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a14, v35 ; Reload Reuse
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a1, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a2, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a3, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a4, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a5, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a6, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a7, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a8, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a9, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a10, v39 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a15, v34 ; Reload Reuse
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; copy
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v32, a2
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a3, v32
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; use a3 v[0:31]
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v33, v0
; GFX90A-NEXT:    v_mov_b32_e32 v32, v1
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:31] a[0:15]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a15
; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a14
; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a13
; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a12
; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a11
; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a10
; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a9
; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a8
; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a7
; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a6
; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a5
; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a4
; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a3
; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a2
; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a1
; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT:    s_nop 1
; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 2
; GFX90A-NEXT:    buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a10 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v37, a12 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a13 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a14 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a15 ; Reload Reuse
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; copy
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a1
; GFX90A-NEXT:    buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT:    s_waitcnt vmcnt(9)
; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v39 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v36 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v35 ; Reload Reuse
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v34 ; Reload Reuse
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; copy
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use a3 v[0:31]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
  ; One inline asm defines both the v[0:31] tuple and the a[0:15] tuple.
  ; %vgpr0 is not consumed until the final asm below, so v0-v31 stay occupied
  ; from here to the end of the function.
  %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"()
  %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
  %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
  ; MFMA that takes %agpr0 as its accumulator (third) operand.
  %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
  ; Output pinned to a1; %agpr0 is passed through an "a" (AGPR class) operand
  ; while a[0:15] is listed as clobbered, so %agpr0 is still needed here after
  ; the MFMA result has been produced.
  %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
  ; Output pinned to a2; %agpr1 goes in an AGPR-class operand and %mfma is
  ; pinned to a[0:15].
  %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
  ; Final consumer: %agpr2 pinned to a3 plus the original v[0:31] tuple.
  call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
  ret void
}

; Check that v32 is still allocated normally when the function uses no AGPRs.
define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: no_agpr_no_reserve:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
; GFX908-NEXT:    global_load_dwordx4 v[5:8], v0, s[0:1]
; GFX908-NEXT:    global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
; GFX908-NEXT:    global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
; GFX908-NEXT:    global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
; GFX908-NEXT:    global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
; GFX908-NEXT:    global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
; GFX908-NEXT:    global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
; GFX908-NEXT:    s_waitcnt vmcnt(7)
; GFX908-NEXT:    v_add_u32_e32 v4, v4, v4
; GFX908-NEXT:    v_add_u32_e32 v3, v3, v3
; GFX908-NEXT:    v_add_u32_e32 v2, v2, v2
; GFX908-NEXT:    v_add_u32_e32 v1, v1, v1
; GFX908-NEXT:    s_waitcnt vmcnt(6)
; GFX908-NEXT:    v_add_u32_e32 v8, v8, v8
; GFX908-NEXT:    v_add_u32_e32 v7, v7, v7
; GFX908-NEXT:    v_add_u32_e32 v6, v6, v6
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_add_u32_e32 v32, v32, v32
; GFX908-NEXT:    v_add_u32_e32 v31, v31, v31
; GFX908-NEXT:    v_add_u32_e32 v30, v30, v30
; GFX908-NEXT:    v_add_u32_e32 v29, v29, v29
; GFX908-NEXT:    v_add_u32_e32 v5, v5, v5
; GFX908-NEXT:    v_add_u32_e32 v12, v12, v12
; GFX908-NEXT:    v_add_u32_e32 v11, v11, v11
; GFX908-NEXT:    v_add_u32_e32 v10, v10, v10
; GFX908-NEXT:    v_add_u32_e32 v9, v9, v9
; GFX908-NEXT:    v_add_u32_e32 v16, v16, v16
; GFX908-NEXT:    v_add_u32_e32 v15, v15, v15
; GFX908-NEXT:    v_add_u32_e32 v14, v14, v14
; GFX908-NEXT:    v_add_u32_e32 v13, v13, v13
; GFX908-NEXT:    v_add_u32_e32 v20, v20, v20
; GFX908-NEXT:    v_add_u32_e32 v19, v19, v19
; GFX908-NEXT:    v_add_u32_e32 v18, v18, v18
; GFX908-NEXT:    v_add_u32_e32 v17, v17, v17
; GFX908-NEXT:    v_add_u32_e32 v24, v24, v24
; GFX908-NEXT:    v_add_u32_e32 v23, v23, v23
; GFX908-NEXT:    v_add_u32_e32 v22, v22, v22
; GFX908-NEXT:    v_add_u32_e32 v21, v21, v21
; GFX908-NEXT:    v_add_u32_e32 v28, v28, v28
; GFX908-NEXT:    v_add_u32_e32 v27, v27, v27
; GFX908-NEXT:    v_add_u32_e32 v26, v26, v26
; GFX908-NEXT:    v_add_u32_e32 v25, v25, v25
; GFX908-NEXT:    global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
; GFX908-NEXT:    global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
; GFX908-NEXT:    global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
; GFX908-NEXT:    global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
; GFX908-NEXT:    global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
; GFX908-NEXT:    global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
; GFX908-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1]
; GFX908-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: no_agpr_no_reserve:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
; GFX90A-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
; GFX90A-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
; GFX90A-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
; GFX90A-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
; GFX90A-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
; GFX90A-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
; GFX90A-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
; GFX90A-NEXT:    s_waitcnt vmcnt(7)
; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v3
; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v2
; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v1
; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v0
; GFX90A-NEXT:    s_waitcnt vmcnt(6)
; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v7
; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v6
; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v5
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_add_u32_e32 v31, v31, v31
; GFX90A-NEXT:    v_add_u32_e32 v30, v30, v30
; GFX90A-NEXT:    v_add_u32_e32 v29, v29, v29
; GFX90A-NEXT:    v_add_u32_e32 v28, v28, v28
; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v4
; GFX90A-NEXT:    v_add_u32_e32 v11, v11, v11
; GFX90A-NEXT:    v_add_u32_e32 v10, v10, v10
; GFX90A-NEXT:    v_add_u32_e32 v9, v9, v9
; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v8
; GFX90A-NEXT:    v_add_u32_e32 v15, v15, v15
; GFX90A-NEXT:    v_add_u32_e32 v14, v14, v14
; GFX90A-NEXT:    v_add_u32_e32 v13, v13, v13
; GFX90A-NEXT:    v_add_u32_e32 v12, v12, v12
; GFX90A-NEXT:    v_add_u32_e32 v19, v19, v19
; GFX90A-NEXT:    v_add_u32_e32 v18, v18, v18
; GFX90A-NEXT:    v_add_u32_e32 v17, v17, v17
; GFX90A-NEXT:    v_add_u32_e32 v16, v16, v16
; GFX90A-NEXT:    v_add_u32_e32 v23, v23, v23
; GFX90A-NEXT:    v_add_u32_e32 v22, v22, v22
; GFX90A-NEXT:    v_add_u32_e32 v21, v21, v21
; GFX90A-NEXT:    v_add_u32_e32 v20, v20, v20
; GFX90A-NEXT:    v_add_u32_e32 v27, v27, v27
; GFX90A-NEXT:    v_add_u32_e32 v26, v26, v26
; GFX90A-NEXT:    v_add_u32_e32 v25, v25, v25
; GFX90A-NEXT:    v_add_u32_e32 v24, v24, v24
; GFX90A-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
; GFX90A-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
; GFX90A-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
; GFX90A-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
; GFX90A-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
; GFX90A-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
; GFX90A-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
; GFX90A-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX90A-NEXT:    s_endpgm
  ; Address the <32 x i32> element of %arg selected by the workitem x id.
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x i32>, ptr addrspace(1) %arg, i32 %id
  ; Load 32 dwords, double each lane (x + x), and store the result back in
  ; place. With the address this keeps 33 VGPR values live at once; the checks
  ; above show v32 being allocated on both targets.
  %load = load <32 x i32>, ptr addrspace(1) %gep
  %add = add <32 x i32> %load, %load
  store <32 x i32> %add, ptr addrspace(1) %gep
  ret void
}

; FIXME: This case is broken. The asm value passed in v32 is live
; through the range where the reserved def for the copy is introduced,
; clobbering the user value.
define void @v32_asm_def_use(float %v0, float %v1) #0 {
; GFX908-LABEL: v32_asm_def_use:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v33, v1
; GFX908-NEXT:    v_mov_b32_e32 v34, v0
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; def v[0:31] a[0:15]
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a15
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; def v32
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a31, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a14
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a30, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a13
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a29, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a12
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a28, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a11
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a27, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a10
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a26, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a9
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a25, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a8
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a24, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a7
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a23, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a6
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a22, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a5
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a21, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a4
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a20, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a3
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a19, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a2
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a18, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a1
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a17, v35
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a0
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a16, v35
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; copy
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a1
; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    v_accvgpr_write_b32 a32, v35
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; copy
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    v_accvgpr_read_b32 v33, a2
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a3, v33
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; use a3 v[0:31]
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; use v32
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v32_asm_def_use:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v34, v0
; GFX90A-NEXT:    v_mov_b32_e32 v33, v1
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:31] a[0:15]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a15
; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a14
; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a13
; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a12
; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a11
; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a10
; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a9
; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a8
; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a7
; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a6
; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a5
; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a4
; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a3
; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a2
; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a1
; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v32
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; copy
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a32 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a1
; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; copy
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v35 ; Reload Reuse
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 1
; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use a3 v[0:31]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use v32
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
  ; One inline asm defines both the v[0:31] tuple and the a[0:15] tuple;
  ; %vgpr0 stays live until the "use $0 $1" asm near the end.
  %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"()
  %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
  %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
  ; MFMA that takes %agpr0 as its accumulator (third) operand.
  %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
  ; User value explicitly pinned to v32. It is consumed only by the last asm
  ; in the function, so it is live across both "; copy " asm statements.
  %v32 = call i32 asm sideeffect "; def $0","=${v32}"()
  ; Output pinned to a1; %agpr0 is passed through an "a" (AGPR class) operand
  ; while a[0:15] is listed as clobbered.
  %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
  ; Output pinned to a2; %agpr1 goes in an AGPR-class operand and %mfma is
  ; pinned to a[0:15].
  %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
  ; Consume %agpr2 in a3 together with the original v[0:31] tuple.
  call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
  ; Consume the value that was pinned to v32 above.
  call void asm sideeffect "; use $0","${v32}"(i32 %v32)
  ret void
}

define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 {
; GFX908-LABEL: introduced_copy_to_sgpr:
; GFX908:       ; %bb.0: ; %bb
; GFX908-NEXT:    global_load_ushort v8, v[0:1], off glc
; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX908-NEXT:    s_mov_b32 s9, 0
; GFX908-NEXT:    s_load_dword s4, s[4:5], 0x18
; GFX908-NEXT:    v_mov_b32_e32 v11, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX908-NEXT:    s_sub_i32 s5, 0, s3
; GFX908-NEXT:    v_cvt_f32_f16_e32 v9, s4
; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v0
; GFX908-NEXT:    v_mov_b32_e32 v0, 0
; GFX908-NEXT:    v_mov_b32_e32 v1, 0
; GFX908-NEXT:    v_readfirstlane_b32 s8, v2
; GFX908-NEXT:    s_mul_i32 s5, s5, s8
; GFX908-NEXT:    s_mul_hi_u32 s5, s8, s5
; GFX908-NEXT:    s_add_i32 s8, s8, s5
; GFX908-NEXT:    s_mul_hi_u32 s5, s2, s8
; GFX908-NEXT:    s_mul_i32 s8, s5, s3
; GFX908-NEXT:    s_sub_i32 s2, s2, s8
; GFX908-NEXT:    s_add_i32 s10, s5, 1
; GFX908-NEXT:    s_sub_i32 s8, s2, s3
; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
; GFX908-NEXT:    s_cselect_b32 s5, s10, s5
; GFX908-NEXT:    s_cselect_b32 s2, s8, s2
; GFX908-NEXT:    s_add_i32 s8, s5, 1
; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
; GFX908-NEXT:    s_cselect_b32 s8, s8, s5
; GFX908-NEXT:    s_lshr_b32 s10, s4, 16
; GFX908-NEXT:    v_cvt_f32_f16_e32 v10, s10
; GFX908-NEXT:    s_lshl_b64 s[10:11], s[8:9], 5
; GFX908-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
; GFX908-NEXT:    s_lshl_b64 s[4:5], s[6:7], 5
; GFX908-NEXT:    s_or_b32 s4, s4, 28
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_readfirstlane_b32 s9, v8
; GFX908-NEXT:    s_and_b32 s9, 0xffff, s9
; GFX908-NEXT:    s_mul_i32 s1, s1, s9
; GFX908-NEXT:    s_mul_hi_u32 s12, s0, s9
; GFX908-NEXT:    s_mul_i32 s0, s0, s9
; GFX908-NEXT:    s_add_i32 s1, s12, s1
; GFX908-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
; GFX908-NEXT:    s_branch .LBB3_2
; GFX908-NEXT:  .LBB3_1: ; %Flow56
; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
; GFX908-NEXT:    s_cbranch_vccz .LBB3_12
; GFX908-NEXT:  .LBB3_2: ; %bb9
; GFX908-NEXT:    ; =>This Loop Header: Depth=1
; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
; GFX908-NEXT:    s_mov_b64 s[14:15], -1
; GFX908-NEXT:    s_cbranch_scc0 .LBB3_10
; GFX908-NEXT:  ; %bb.3: ; %bb14
; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX908-NEXT:    v_cmp_lt_i64_e64 s[12:13], s[6:7], 0
; GFX908-NEXT:    v_cmp_gt_i64_e64 s[14:15], s[6:7], -1
; GFX908-NEXT:    v_mov_b32_e32 v12, 0
; GFX908-NEXT:    s_mov_b64 s[18:19], s[4:5]
; GFX908-NEXT:    v_mov_b32_e32 v18, 0
; GFX908-NEXT:    v_mov_b32_e32 v17, 0
; GFX908-NEXT:    v_mov_b32_e32 v16, 0
; GFX908-NEXT:    v_mov_b32_e32 v15, 0
; GFX908-NEXT:    v_mov_b32_e32 v14, 0
; GFX908-NEXT:    v_mov_b32_e32 v13, 0
; GFX908-NEXT:    v_mov_b32_e32 v19, 0
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
; GFX908-NEXT:    v_readfirstlane_b32 s16, v3
; GFX908-NEXT:    s_add_u32 s9, s9, 1
; GFX908-NEXT:    s_addc_u32 s17, s16, 0
; GFX908-NEXT:    s_mul_hi_u32 s20, s2, s9
; GFX908-NEXT:    s_mul_i32 s21, s3, s9
; GFX908-NEXT:    s_mul_i32 s16, s2, s9
; GFX908-NEXT:    s_mul_i32 s9, s2, s17
; GFX908-NEXT:    s_add_i32 s9, s20, s9
; GFX908-NEXT:    s_add_i32 s9, s9, s21
; GFX908-NEXT:    s_branch .LBB3_5
; GFX908-NEXT:  .LBB3_4: ; %bb58
; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX908-NEXT:    s_add_u32 s18, s18, s0
; GFX908-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
; GFX908-NEXT:    s_addc_u32 s19, s19, s1
; GFX908-NEXT:    s_mov_b64 s[20:21], 0
; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
; GFX908-NEXT:    s_cbranch_vccz .LBB3_9
; GFX908-NEXT:  .LBB3_5: ; %bb16
; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX908-NEXT:    s_add_u32 s20, s18, s16
; GFX908-NEXT:    s_addc_u32 s21, s19, s9
; GFX908-NEXT:    global_load_dword v21, v11, s[20:21] offset:-12 glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_load_dword v20, v11, s[20:21] offset:-8 glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_load_dword v4, v11, s[20:21] offset:-4 glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    global_load_dword v4, v11, s[20:21] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    ds_read_b64 v[4:5], v11
; GFX908-NEXT:    ds_read_b64 v[6:7], v0
; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    s_cbranch_vccnz .LBB3_7
; GFX908-NEXT:  ; %bb.6: ; %bb51
; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT:    v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT:    v_cvt_f32_f16_e32 v21, v21
; GFX908-NEXT:    v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT:    v_cvt_f32_f16_e32 v20, v20
; GFX908-NEXT:    v_add_f32_e32 v24, v10, v5
; GFX908-NEXT:    v_add_f32_e32 v25, v9, v4
; GFX908-NEXT:    v_add_f32_e32 v26, 0, v5
; GFX908-NEXT:    v_add_f32_e32 v27, 0, v4
; GFX908-NEXT:    v_add_f32_e32 v7, v22, v7
; GFX908-NEXT:    v_add_f32_e32 v6, v21, v6
; GFX908-NEXT:    v_add_f32_e32 v5, v23, v5
; GFX908-NEXT:    v_add_f32_e32 v4, v20, v4
; GFX908-NEXT:    v_add_f32_e32 v12, v12, v25
; GFX908-NEXT:    v_add_f32_e32 v18, v18, v24
; GFX908-NEXT:    v_add_f32_e32 v17, v17, v27
; GFX908-NEXT:    v_add_f32_e32 v16, v16, v26
; GFX908-NEXT:    v_add_f32_e32 v15, v15, v6
; GFX908-NEXT:    v_add_f32_e32 v14, v14, v7
; GFX908-NEXT:    v_add_f32_e32 v13, v13, v4
; GFX908-NEXT:    v_add_f32_e32 v19, v19, v5
; GFX908-NEXT:    s_mov_b64 s[20:21], -1
; GFX908-NEXT:    s_branch .LBB3_4
; GFX908-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT:    s_mov_b64 s[20:21], s[12:13]
; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT:    ; implicit-def: $vgpr19
; GFX908-NEXT:    ; implicit-def: $vgpr13
; GFX908-NEXT:    ; implicit-def: $vgpr14
; GFX908-NEXT:    ; implicit-def: $vgpr15
; GFX908-NEXT:    ; implicit-def: $vgpr16
; GFX908-NEXT:    ; implicit-def: $vgpr17
; GFX908-NEXT:    ; implicit-def: $vgpr18
; GFX908-NEXT:    ; implicit-def: $vgpr12
; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT:    s_xor_b64 s[14:15], s[20:21], -1
; GFX908-NEXT:  .LBB3_10: ; %Flow55
; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT:    s_mov_b64 s[12:13], -1
; GFX908-NEXT:    s_and_b64 vcc, exec, s[14:15]
; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
; GFX908-NEXT:  ; %bb.11: ; %bb12
; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT:    s_add_u32 s6, s6, s8
; GFX908-NEXT:    s_addc_u32 s7, s7, 0
; GFX908-NEXT:    s_add_u32 s4, s4, s10
; GFX908-NEXT:    s_addc_u32 s5, s5, s11
; GFX908-NEXT:    s_mov_b64 s[12:13], 0
; GFX908-NEXT:    s_branch .LBB3_1
; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: introduced_copy_to_sgpr:
; GFX90A:       ; %bb.0: ; %bb
; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX90A-NEXT:    s_mov_b32 s9, 0
; GFX90A-NEXT:    s_load_dword s4, s[4:5], 0x18
; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v0
; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, s4
; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
; GFX90A-NEXT:    s_mul_i32 s5, s5, s8
; GFX90A-NEXT:    s_mul_hi_u32 s5, s8, s5
; GFX90A-NEXT:    s_add_i32 s8, s8, s5
; GFX90A-NEXT:    s_mul_hi_u32 s5, s2, s8
; GFX90A-NEXT:    s_mul_i32 s8, s5, s3
; GFX90A-NEXT:    s_sub_i32 s2, s2, s8
; GFX90A-NEXT:    s_add_i32 s10, s5, 1
; GFX90A-NEXT:    s_sub_i32 s8, s2, s3
; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
; GFX90A-NEXT:    s_cselect_b32 s5, s10, s5
; GFX90A-NEXT:    s_cselect_b32 s2, s8, s2
; GFX90A-NEXT:    s_add_i32 s8, s5, 1
; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
; GFX90A-NEXT:    s_cselect_b32 s8, s8, s5
; GFX90A-NEXT:    s_lshr_b32 s10, s4, 16
; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, s10
; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[8:9], 5
; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[6:7], 5
; GFX90A-NEXT:    s_or_b32 s4, s4, 28
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_readfirstlane_b32 s9, v18
; GFX90A-NEXT:    s_and_b32 s9, 0xffff, s9
; GFX90A-NEXT:    s_mul_i32 s1, s1, s9
; GFX90A-NEXT:    s_mul_hi_u32 s12, s0, s9
; GFX90A-NEXT:    s_mul_i32 s0, s0, s9
; GFX90A-NEXT:    s_add_i32 s1, s12, s1
; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
; GFX90A-NEXT:    s_branch .LBB3_2
; GFX90A-NEXT:  .LBB3_1: ; %Flow56
; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
; GFX90A-NEXT:    s_cbranch_vccz .LBB3_12
; GFX90A-NEXT:  .LBB3_2: ; %bb9
; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
; GFX90A-NEXT:    ; Child Loop BB3_5 Depth 2
; GFX90A-NEXT:    s_mov_b64 s[14:15], -1
; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_10
; GFX90A-NEXT:  ; %bb.3: ; %bb14
; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[12:13], s[6:7], 0
; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[14:15], s[6:7], -1
; GFX90A-NEXT:    s_mov_b64 s[18:19], s[4:5]
; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
; GFX90A-NEXT:    v_mov_b32_e32 v12, v6
; GFX90A-NEXT:    v_mov_b32_e32 v13, v6
; GFX90A-NEXT:    v_mov_b32_e32 v10, v6
; GFX90A-NEXT:    v_mov_b32_e32 v11, v6
; GFX90A-NEXT:    v_mov_b32_e32 v8, v6
; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_readfirstlane_b32 s9, v4
; GFX90A-NEXT:    v_readfirstlane_b32 s16, v5
; GFX90A-NEXT:    s_add_u32 s9, s9, 1
; GFX90A-NEXT:    s_addc_u32 s17, s16, 0
; GFX90A-NEXT:    s_mul_hi_u32 s20, s2, s9
; GFX90A-NEXT:    s_mul_i32 s21, s3, s9
; GFX90A-NEXT:    s_mul_i32 s16, s2, s9
; GFX90A-NEXT:    s_mul_i32 s9, s2, s17
; GFX90A-NEXT:    s_add_i32 s9, s20, s9
; GFX90A-NEXT:    s_add_i32 s9, s9, s21
; GFX90A-NEXT:    s_branch .LBB3_5
; GFX90A-NEXT:  .LBB3_4: ; %bb58
; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX90A-NEXT:    s_add_u32 s18, s18, s0
; GFX90A-NEXT:    s_addc_u32 s19, s19, s1
; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
; GFX90A-NEXT:    s_mov_b64 s[20:21], 0
; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
; GFX90A-NEXT:    s_cbranch_vccz .LBB3_9
; GFX90A-NEXT:  .LBB3_5: ; %bb16
; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT:    s_add_u32 s20, s18, s16
; GFX90A-NEXT:    s_addc_u32 s21, s19, s9
; GFX90A-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] offset:-4 glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    ds_read_b64 v[14:15], v19
; GFX90A-NEXT:    ds_read_b64 v[16:17], v0
; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
; GFX90A-NEXT:    ; kill: killed $sgpr20 killed $sgpr21
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_7
; GFX90A-NEXT:  ; %bb.6: ; %bb51
; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT:    v_cvt_f32_f16_e32 v22, v21
; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT:    v_cvt_f32_f16_e32 v20, v20
; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[0:1], v[14:15]
; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0]
; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[22:23], v[16:17]
; GFX90A-NEXT:    v_pk_add_f32 v[14:15], v[20:21], v[14:15]
; GFX90A-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[24:25]
; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[26:27]
; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[16:17]
; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[14:15]
; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
; GFX90A-NEXT:    s_branch .LBB3_4
; GFX90A-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT:    s_mov_b64 s[20:21], s[12:13]
; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT:    ; implicit-def: $vgpr9
; GFX90A-NEXT:    ; implicit-def: $vgpr11
; GFX90A-NEXT:    ; implicit-def: $vgpr13
; GFX90A-NEXT:    ; implicit-def: $vgpr7
; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT:    s_xor_b64 s[14:15], s[20:21], -1
; GFX90A-NEXT:  .LBB3_10: ; %Flow55
; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT:    s_mov_b64 s[12:13], -1
; GFX90A-NEXT:    s_and_b64 vcc, exec, s[14:15]
; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
; GFX90A-NEXT:  ; %bb.11: ; %bb12
; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT:    s_add_u32 s6, s6, s8
; GFX90A-NEXT:    s_addc_u32 s7, s7, 0
; GFX90A-NEXT:    s_add_u32 s4, s4, s10
; GFX90A-NEXT:    s_addc_u32 s5, s5, s11
; GFX90A-NEXT:    s_mov_b64 s[12:13], 0
; GFX90A-NEXT:    s_branch .LBB3_1
; GFX90A-NEXT:  .LBB3_12: ; %DummyReturnBlock
; GFX90A-NEXT:    s_endpgm
bb:
  %i = load volatile i16, ptr addrspace(4) undef, align 2
  %i6 = zext i16 %i to i64
  %i7 = udiv i32 %arg1, %arg2
  %i8 = zext i32 %i7 to i64
  br label %bb9

bb9:                                              ; preds = %bb12, %bb
  %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
  br i1 undef, label %bb14, label %bb12

bb12:                                             ; preds = %bb58, %bb9
  %i13 = add nuw nsw i64 %i10, %i8
  br label %bb9

bb14:                                             ; preds = %bb9
  %i11 = icmp slt i64 %i10, 0
  %i15 = load i64, ptr addrspace(1) null, align 8
  br label %bb16

bb16:                                             ; preds = %bb58, %bb14
  %i17 = phi i64 [ %i65, %bb58 ], [ %i15, %bb14 ]
  %i18 = phi <2 x float> [ %i59, %bb58 ], [ zeroinitializer, %bb14 ]
  %i19 = phi <2 x float> [ %i60, %bb58 ], [ zeroinitializer, %bb14 ]
  %i20 = phi <2 x float> [ %i61, %bb58 ], [ zeroinitializer, %bb14 ]
  %i21 = phi <2 x float> [ %i62, %bb58 ], [ zeroinitializer, %bb14 ]
  %i22 = add nsw i64 %i17, 1
  %i23 = mul nsw i64 %i22, %arg
  %i24 = add nsw i64 %i23, %i10
  %i25 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 8
  %i27 = load volatile <2 x half>, ptr addrspace(1) %i25, align 16
  %i28 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 10
  %i30 = load volatile <2 x half>, ptr addrspace(1) %i28, align 4
  %i31 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 12
  %i33 = load volatile <2 x half>, ptr addrspace(1) %i31, align 8
  %i34 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 14
  %i36 = load volatile <2 x half>, ptr addrspace(1) %i34, align 4
  %i43 = load volatile <2 x float>, ptr addrspace(3) null, align 8
  %i46 = load volatile <2 x float>, ptr addrspace(3) undef, align 32
  fence syncscope("workgroup") acquire
  br i1 %i11, label %bb58, label %bb51

bb51:                                             ; preds = %bb16
  %i37 = fpext <2 x half> %arg4 to <2 x float>
  %i39 = fpext <2 x half> %i27 to <2 x float>
  %i40 = fpext <2 x half> %i30 to <2 x float>
  %i41 = fpext <2 x half> %i33 to <2 x float>
  %i42 = fpext <2 x half> %i36 to <2 x float>
  %i44 = fadd contract <2 x float> %i37, %i43
  %i45 = fadd contract <2 x float> %i43, zeroinitializer
  %i47 = fadd contract <2 x float> %i39, %i46
  %i48 = fadd contract <2 x float> %i40, %i43
  %i49 = fadd contract <2 x float> %i41, zeroinitializer
  %i50 = fadd contract <2 x float> %i42, zeroinitializer
  %i52 = fadd contract <2 x float> %i18, %i44
  %i53 = fadd contract <2 x float> %i19, %i45
  %i54 = fadd contract <2 x float> %i20, %i47
  %i55 = fadd contract <2 x float> %i21, %i48
  %i56 = fadd contract <2 x float> %i49, zeroinitializer
  %i57 = fadd contract <2 x float> %i50, zeroinitializer
  br label %bb58

bb58:                                             ; preds = %bb51, %bb16
  %i59 = phi <2 x float> [ %i18, %bb16 ], [ %i52, %bb51 ]
  %i60 = phi <2 x float> [ %i19, %bb16 ], [ %i53, %bb51 ]
  %i61 = phi <2 x float> [ %i20, %bb16 ], [ %i54, %bb51 ]
  %i62 = phi <2 x float> [ %i21, %bb16 ], [ %i55, %bb51 ]
  %i63 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i56, %bb51 ]
  %i64 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i57, %bb51 ]
  %i65 = add nsw i64 %i17, %i6
  %i66 = icmp slt i64 %i65, 0
  br i1 %i66, label %bb16, label %bb12
}

; This testcase would fail on GFX908 due to not having a free VGPR available to
; copy SGPR to AGPR.
;
; Shape of the test: the first inline asm defines all of v[0:31] together with
; s[0:15]. The s[0:15] value is then fed to the MFMA as its accumulator input,
; so it has to be copied into AGPRs while v[0:31] is still live (v[0:31] is
; only consumed by the last inline asm). In the expected GFX908 output every
; SGPR is routed through v39 before the v_accvgpr_write, whereas the expected
; GFX90A output writes the SGPRs into the AGPRs directly.
; NOTE(review): attribute #0 ("amdgpu-waves-per-eu"="6,6") presumably shrinks
; the VGPR budget so that no spare VGPR is left over - confirm against the
; subtarget's occupancy tables.
define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v32, v1
; GFX908-NEXT:    v_mov_b32_e32 v33, v0
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; def v[0:31] s[0:15]
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_mov_b32_e32 v39, s15
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a31, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s14
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a30, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s13
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a29, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s12
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a28, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s11
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a27, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s10
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a26, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s9
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a25, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s8
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a24, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s7
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a23, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s6
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a22, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s5
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a21, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s4
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a20, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s3
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a19, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s2
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a18, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s1
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a17, v39
; GFX908-NEXT:    v_mov_b32_e32 v39, s0
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v37, a12 ; Reload Reuse
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a1 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v36, a13 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v35, a14 ; Reload Reuse
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a2 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_read_b32 v34, a15 ; Reload Reuse
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a3 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a4 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a5 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a6 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a7 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a8 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a9 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a10 ; Reload Reuse
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; copy
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v39, a1
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a32, v39
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a0, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT:    v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a12, v37 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a13, v36 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a14, v35 ; Reload Reuse
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a1, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a2, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a3, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a4, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a5, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a6, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a7, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a8, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a9, v39 ; Reload Reuse
; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX908-NEXT:    s_waitcnt vmcnt(0)
; GFX908-NEXT:    v_accvgpr_write_b32 a10, v39 ; Reload Reuse
; GFX908-NEXT:    v_accvgpr_write_b32 a15, v34 ; Reload Reuse
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; copy
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    v_accvgpr_read_b32 v32, a2
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a3, v32
; GFX908-NEXT:    ;;#ASMSTART
; GFX908-NEXT:    ; use a3 v[0:31]
; GFX908-NEXT:    ;;#ASMEND
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v33, v0
; GFX90A-NEXT:    v_mov_b32_e32 v32, v1
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:31] s[0:15]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s15
; GFX90A-NEXT:    v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT:    v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT:    v_accvgpr_write_b32 a28, s12
; GFX90A-NEXT:    v_accvgpr_write_b32 a27, s11
; GFX90A-NEXT:    v_accvgpr_write_b32 a26, s10
; GFX90A-NEXT:    v_accvgpr_write_b32 a25, s9
; GFX90A-NEXT:    v_accvgpr_write_b32 a24, s8
; GFX90A-NEXT:    v_accvgpr_write_b32 a23, s7
; GFX90A-NEXT:    v_accvgpr_write_b32 a22, s6
; GFX90A-NEXT:    v_accvgpr_write_b32 a21, s5
; GFX90A-NEXT:    v_accvgpr_write_b32 a20, s4
; GFX90A-NEXT:    v_accvgpr_write_b32 a19, s3
; GFX90A-NEXT:    v_accvgpr_write_b32 a18, s2
; GFX90A-NEXT:    v_accvgpr_write_b32 a17, s1
; GFX90A-NEXT:    v_accvgpr_write_b32 a16, s0
; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a32 ; Reload Reuse
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 2
; GFX90A-NEXT:    buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX90A-NEXT:    buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a11 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v38, a12 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v37, a13 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a14 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a15 ; Reload Reuse
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; copy
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a1
; GFX90A-NEXT:    buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT:    buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX90A-NEXT:    s_waitcnt vmcnt(10)
; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v39 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v38 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v37 ; Reload Reuse
; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v36 ; Reload Reuse
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v35 ; Reload Reuse
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; copy
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a2
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use a3 v[0:31]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v34 ; Reload Reuse
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
  ; Define v[0:31] and s[0:15] in one asm so that both tuples are live at once.
  ; Note that, despite its name, %agpr0 below is the value constrained to
  ; s[0:15]; it is not an AGPR tuple until the backend copies it.
  %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"()
  %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
  %agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
  ; The SGPR-defined value is the third operand of the MFMA; the checks above
  ; show it being materialized in a[16:31] for that purpose.
  %mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
  ; Produces an i32 in a1, reads %agpr0 through an "a" constraint and clobbers
  ; a[0:15] while %mfma is still needed afterwards.
  %agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
  ; Produces an i32 in a2 from %agpr1 with %mfma pinned to a[0:15].
  %agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
  ; Final consumer: needs %agpr2 in a3 and the original v[0:31] value, which is
  ; what keeps v[0:31] occupied across everything above.
  call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
  ret void
}

; MFMA intrinsic called by the test functions in this file.
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
; NOTE(review): no call to this intrinsic is visible in the reviewed part of
; the file; presumably an earlier test uses it - verify before removing.
declare i32 @llvm.amdgcn.workitem.id.x() #2

; #0 - attached to the no_free_vgprs_* test functions; requests a waves-per-EU
;      range whose lower and upper bound are both 6.
; #1 - attributes of the MFMA intrinsic declaration.
; #2 - attributes of the workitem.id.x declaration.
; #3 - same knob as #0 with both bounds set to 7.
;      NOTE(review): its user is not visible in the reviewed part of the file.
attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
attributes #1 = { convergent nounwind readnone willreturn }
attributes #2 = { nounwind readnone willreturn }
attributes #3 = { "amdgpu-waves-per-eu"="7,7" }
